{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Put the parent of the current working directory on the import path; the\n",
    "# project-local packages imported later (model_selection, datasets, tree)\n",
    "# presumably live there -- TODO confirm against the repository layout.\n",
    "PARENT_DIR = os.path.dirname(os.path.abspath('.'))\n",
    "if PARENT_DIR not in sys.path:    # keep re-runs of this cell idempotent\n",
    "    sys.path.append(PARENT_DIR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据准备"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(455, 30) (114, 30) (455,) (114,)\n"
     ]
    }
   ],
   "source": [
    "from model_selection.train_test_split import train_test_split\n",
    "import numpy as np\n",
    "\n",
    "from datasets.dataset import load_breast_cancer\n",
    "# Load features and labels through datasets.dataset.load_breast_cancer.\n",
    "data = load_breast_cancer()\n",
    "X, Y = data.data, data.target\n",
    "Y[Y == 0] = -1    # NOTE: AdaBoost here uses -1/+1 labels, so class 0 is remapped to -1 (in place)\n",
    "del data\n",
    "\n",
    "# Split into train and test sets with test_size=0.2.\n",
    "# NOTE(review): no seed is passed; if this split is randomized, the accuracies\n",
    "# printed further down will vary between runs -- confirm.\n",
    "X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)\n",
    "print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 模型基础\n",
    "sklearn中的AdaBoost算法（`AdaBoostClassifier`）默认的基模型为决策树桩（即```max_depth=1```的决策树），由于我们还没有在决策树模块中实现```max_depth```参数，首先来看一下sklearn中的决策树桩表现如何："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from sklearn.tree import DecisionTreeClassifier\n",
    "# tree_stump=DecisionTreeClassifier(criterion='entropy',max_depth=1)\n",
    "# tree_stump.fit(X_train,Y_train)\n",
    "# Y_pred=tree_stump.predict(X_test)\n",
    "\n",
    "# print('stump acc:{}'.format(np.sum(Y_pred==Y_test)/len(Y_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "出于还未实现```max_depth```参数的考量，暂时使用sklearn中的决策树桩来实现一个简单的boosting算法。注意AdaBoost的关键在于权重的更新，并且样本与模型都带有权重，但是最终预测起作用的只有模型的权重。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def update_w(w, Y_train, Y_pred):\n",
    "#     weight_err = np.sum(w*(Y_train != Y_pred))/np.sum(w)    # 加权训练误差\n",
    "#     alpha = np.log(1/weight_err-1)\n",
    "#     w = w*np.exp(alpha*(Y_train != Y_pred))\n",
    "#     w=w/np.sum(w)    # 归一化\n",
    "\n",
    "#     return w, alpha\n",
    "\n",
    "\n",
    "# # w = np.array([1/3, 1/3, 1/3])\n",
    "# # Y_true = np.array([0, 0, 0])\n",
    "# # Y_pred = np.array([0, 0, 1])\n",
    "# # update_w(w, Y_true, Y_pred)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "开始串行训练模型并更新参数："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# n_samples, n_features = X_train.shape\n",
    "\n",
    "# K = 5    # 模型数量\n",
    "# w = np.array([1/n_samples]*n_samples)    # 初始样本权重\n",
    "# alphas = list()    # 初始模型权重\n",
    "# base_models = list()\n",
    "\n",
    "# for i in range(K):\n",
    "#     cur_model = DecisionTreeClassifier(max_depth=1)\n",
    "#     cur_model.fit(X_train, Y_train,sample_weight=w)    # 样本权重将用于树的分裂过程\n",
    "#     cur_pred = cur_model.predict(X_train)\n",
    "\n",
    "#     w, alpha = update_w(w, Y_train, cur_pred)\n",
    "#     alphas.append(alpha)\n",
    "#     base_models.append(cur_model)\n",
    "    \n",
    "# # print(w)\n",
    "# # print(alphas)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# for i in range(K):\n",
    "#     if i==0:\n",
    "#         Y_pred=alphas[i]*base_models[i].predict(X_test)\n",
    "#     else:\n",
    "#         Y_pred=np.c_[Y_pred,alphas[i]*base_models[i].predict(X_test)]\n",
    "        \n",
    "# Y_pred=np.sign(np.sum(Y_pred,axis=1))\n",
    "\n",
    "# print('ada acc:{}'.format(np.sum(Y_pred==Y_test)/len(Y_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "好，借助sklearn的帮助，实现adaboost算法很简单，但是该库的目的是使用Python与numpy实现sklearn的功能，从上述代码中可以看出，关键在于实现树算法中的```max_depth```与```sample_weight```参数，前者用于生成决策树桩，后者用于控制树分裂时样本的权重。该部分的指导详见```../tree/add_param.ipynb```。\n",
    "\n",
    "接下来引入我们自己实现的决策树模型来构建adaboost分类器。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "stump acc:0.9035087719298246\n"
     ]
    }
   ],
   "source": [
    "from tree.DecisionTreeClassifier import DecisionTreeClassifier\n",
    "\n",
    "# Baseline: a single depth-1 tree (decision stump) from the project's tree module.\n",
    "tree_stump = DecisionTreeClassifier(max_depth=1)\n",
    "tree_stump.fit(X_train, Y_train)\n",
    "Y_pred = tree_stump.predict(X_test)\n",
    "\n",
    "# Accuracy = number of matching test labels divided by the test-set size.\n",
    "n_correct = np.sum(Y_pred == Y_test)\n",
    "stump_acc = n_correct/len(Y_test)\n",
    "print('stump acc:{}'.format(stump_acc))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "权重更新函数仍然不变，使用上面同样的函数："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def update_w(w, Y_train, Y_pred, eps=1e-10):\n",
    "    \"\"\"Run one AdaBoost re-weighting step.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    w : np.ndarray\n",
    "        Current sample weights, one per training sample.\n",
    "    Y_train : np.ndarray\n",
    "        True training labels.\n",
    "    Y_pred : np.ndarray\n",
    "        Labels predicted by the current base model on the training set.\n",
    "    eps : float, optional\n",
    "        The weighted error is clipped to [eps, 1 - eps] so that a base model\n",
    "        that is perfect (error 0) or always wrong (error 1) still yields a\n",
    "        finite alpha instead of a division by zero / log(0) and NaN weights.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    w : np.ndarray\n",
    "        Updated sample weights, normalized to sum to 1.\n",
    "    alpha : float\n",
    "        Weight of the current base model, log((1 - err) / err).\n",
    "    \"\"\"\n",
    "    miss = (Y_train != Y_pred)    # True where the base model is wrong\n",
    "    weight_err = np.sum(w*miss)/np.sum(w)    # weighted training error\n",
    "    weight_err = np.clip(weight_err, eps, 1 - eps)    # guard err == 0 and err == 1\n",
    "    alpha = np.log(1/weight_err-1)\n",
    "    w = w*np.exp(alpha*miss)    # scale misclassified samples by exp(alpha); others unchanged\n",
    "    w = w/np.sum(w)    # normalize\n",
    "\n",
    "    return w, alpha"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "n_samples, n_features = X_train.shape\n",
    "\n",
    "K = 5    # number of base models (boosting rounds)\n",
    "w = np.full(n_samples, 1/n_samples)    # uniform initial sample weights\n",
    "alphas, base_models = [], []    # per-round model weights and fitted base models\n",
    "\n",
    "for i in range(K):\n",
    "    # Fit a depth-1 tree on the weighted samples; per the original note the\n",
    "    # sample weights are used during the tree's splitting procedure.\n",
    "    stump = DecisionTreeClassifier(max_depth=1)\n",
    "    stump.fit(X_train, Y_train, sample_weight=w)\n",
    "    train_pred = stump.predict(X_train)\n",
    "\n",
    "    # Re-weight the samples and record this round's model and its weight.\n",
    "    w, alpha = update_w(w, Y_train, train_pred)\n",
    "    base_models.append(stump)\n",
    "    alphas.append(alpha)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ada acc:0.9210526315789473\n"
     ]
    }
   ],
   "source": [
    "# Weighted vote: column k holds alpha_k * (prediction of base model k) on X_test.\n",
    "# Building all columns at once avoids re-copying the array on every iteration,\n",
    "# and the result stays 2-D even when only one model was trained, so the\n",
    "# row-wise sum below is always valid.\n",
    "votes = np.column_stack([alpha*model.predict(X_test)\n",
    "                         for alpha, model in zip(alphas, base_models)])\n",
    "\n",
    "# np.sign yields 0 on an exact tie, which matches neither the -1 nor the +1 label.\n",
    "Y_pred = np.sign(np.sum(votes, axis=1))\n",
    "\n",
    "print('ada acc:{}'.format(np.sum(Y_pred == Y_test)/len(Y_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "hide_input": false,
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
